# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# IPython/Jupyter magic: render matplotlib figures inline (not valid plain Python)
%matplotlib inline
# Ignore harmless warnings (NOTE: this also hides genuine numeric warnings,
# e.g. divide-by-zero in the metric computations later on)
import warnings
warnings.filterwarnings("ignore")
# Show every column when displaying a DataFrame
pd.set_option("display.max_columns", None)
# Import pandasql to run SQL-style queries against DataFrames
import pandasql as psql
# Load the admissions dataset from Excel (first row is the header)
data = pd.read_excel(r"data(New).xlsx", header=0)
# Keep an untouched backup copy of the raw data
data_bk =data.copy()
# Display the first 5 records
data.head()
| id | year | institute_type | round_no | quota | pool | institute_short | program_name | program_duration | degree_short | category | opening_rank | closing_rank | is_preparatory | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2016 | IIT | 6 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | GEN | 838 | 1841 | 0 |
| 1 | 2 | 2016 | IIT | 6 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | OBC-NCL | 408 | 1098 | 0 |
| 2 | 3 | 2016 | IIT | 6 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | SC | 297 | 468 | 0 |
| 3 | 4 | 2016 | IIT | 6 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | ST | 79 | 145 | 0 |
| 4 | 5 | 2016 | IIT | 6 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | GEN-PWD | 94 | 94 | 0 |
# Display the dataset structure: column names, non-null counts and dtypes
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 64958 entries, 0 to 64957 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 64958 non-null int64 1 year 64958 non-null int64 2 institute_type 64958 non-null object 3 round_no 64958 non-null int64 4 quota 64958 non-null object 5 pool 64958 non-null object 6 institute_short 64958 non-null object 7 program_name 64958 non-null object 8 program_duration 64958 non-null object 9 degree_short 64958 non-null object 10 category 64958 non-null object 11 opening_rank 64958 non-null int64 12 closing_rank 64958 non-null int64 13 is_preparatory 64958 non-null int64 dtypes: int64(6), object(8) memory usage: 6.9+ MB
# Display the number of unique values per column
data.nunique()
id 25458 year 6 institute_type 2 round_no 4 quota 7 pool 2 institute_short 54 program_name 130 program_duration 2 degree_short 13 category 10 opening_rank 10984 closing_rank 11940 is_preparatory 2 dtype: int64
# Display the shape of the dataset (rows, columns)
data.shape
(64958, 14)
# Check whether any fully duplicated rows exist in the dataset
data.duplicated().any()
True
# Collect the duplicated rows (keep='last' marks the earlier occurrences)
data_dup=data[data.duplicated(keep='last')]
# Display the duplicate records
data_dup
| id | year | institute_type | round_no | quota | pool | institute_short | program_name | program_duration | degree_short | category | opening_rank | closing_rank | is_preparatory | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9205 | 9206 | 2021 | IIT | 1 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | GEN | 123 | 2003 | 0 |
| 9206 | 9207 | 2021 | IIT | 1 | AI | Female-Only | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | GEN | 702 | 4419 | 0 |
| 9207 | 9208 | 2021 | IIT | 1 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | OBC-NCL | 389 | 1123 | 0 |
| 9208 | 9209 | 2021 | IIT | 1 | AI | Female-Only | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | OBC-NCL | 1618 | 2505 | 0 |
| 9209 | 9210 | 2021 | IIT | 1 | AI | Gender-Neutral | IIT-Bombay | Aerospace Engineering | 4 Years | B.Tech | SC | 129 | 579 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 64903 | 31136 | 2021 | NIT | 1 | JK | Female-Only | NIT-Srinagar | Electronics and Communication Engineering | 4 Years | B.Tech | SC | 14185 | 24048 | 0 |
| 64904 | 31137 | 2021 | NIT | 1 | JK | Gender-Neutral | NIT-Srinagar | Electronics and Communication Engineering | 4 Years | B.Tech | ST | 2736 | 4171 | 0 |
| 64905 | 31138 | 2021 | NIT | 1 | JK | Female-Only | NIT-Srinagar | Electronics and Communication Engineering | 4 Years | B.Tech | ST | 10870 | 10870 | 0 |
| 64906 | 31139 | 2021 | NIT | 1 | LA | Gender-Neutral | NIT-Srinagar | Electronics and Communication Engineering | 4 Years | B.Tech | GEN | 166453 | 265454 | 0 |
| 64907 | 31140 | 2021 | NIT | 1 | LA | Female-Only | NIT-Srinagar | Electronics and Communication Engineering | 4 Years | B.Tech | GEN | 215054 | 215054 | 0 |
39500 rows × 14 columns
# Remove the identified duplicate rows (first occurrence of each is kept)
data=data.drop_duplicates()
# Display the shape of the de-duplicated dataset
data.shape
(25458, 14)
# Re-set the row index so it runs 0..n-1 again after the row drops
data=data.reset_index(drop=True)
# Back up the cleaned data after deletion of duplicate records
data_bk2=data.copy()
# Confirm no duplicated rows remain
data.duplicated().any()
False
# Display the count of missing values per column
data.isnull().sum()
id 0 year 0 institute_type 0 round_no 0 quota 0 pool 0 institute_short 0 program_name 0 program_duration 0 degree_short 0 category 0 opening_rank 0 closing_rank 0 is_preparatory 0 dtype: int64
# Display descriptive statistics for the numeric columns
data.describe()
| id | year | round_no | opening_rank | closing_rank | is_preparatory | |
|---|---|---|---|---|---|---|
| count | 25458.000000 | 25458.000000 | 25458.000000 | 2.545800e+04 | 2.545800e+04 | 25458.000000 |
| mean | 15065.188978 | 2019.524118 | 4.864993 | 8.347711e+03 | 1.100359e+04 | 0.035706 |
| std | 9630.192936 | 1.431272 | 2.530553 | 2.946525e+04 | 4.170573e+04 | 0.185559 |
| min | 1.000000 | 2016.000000 | 1.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| 25% | 6365.250000 | 2019.000000 | 1.000000 | 6.550000e+02 | 8.260000e+02 | 0.000000 |
| 50% | 12729.500000 | 2020.000000 | 6.000000 | 2.237000e+03 | 2.715000e+03 | 0.000000 |
| 75% | 23803.750000 | 2021.000000 | 7.000000 | 6.781750e+03 | 8.155500e+03 | 0.000000 |
| max | 31140.000000 | 2021.000000 | 7.000000 | 1.082601e+06 | 1.144790e+06 | 1.000000 |
# To find outliers
first_quantile=data['id'].quantile(.25)
third_quantile=data['id'].quantile(.75)
IQR=third_quantile-first_quantile
upper_bound=round(third_quantile+1.5*IQR,3)
upper_bound
lower_bound=round(first_quantile-1.5*IQR,3)
lower_bound
data[(data.id < lower_bound) | (data.id > upper_bound)]
| id | year | institute_type | round_no | quota | pool | institute_short | program_name | program_duration | degree_short | category | opening_rank | closing_rank | is_preparatory |
|---|
# IQR outlier check on 'year' using the same Tukey-fence rule
first_quantile = data['year'].quantile(.25)
third_quantile = data['year'].quantile(.75)
IQR = third_quantile - first_quantile
lower_bound = round(first_quantile - 1.5 * IQR, 3)
lower_bound
upper_bound = round(third_quantile + 1.5 * IQR, 3)
upper_bound
# Select the rows falling outside the fences (none for this column)
outlier_mask = (data.year < lower_bound) | (data.year > upper_bound)
data[outlier_mask]
| id | year | institute_type | round_no | quota | pool | institute_short | program_name | program_duration | degree_short | category | opening_rank | closing_rank | is_preparatory |
|---|
# IQR outlier check on 'round_no' using the same Tukey-fence rule
first_quantile = data['round_no'].quantile(.25)
third_quantile = data['round_no'].quantile(.75)
IQR = third_quantile - first_quantile
lower_bound = round(first_quantile - 1.5 * IQR, 3)
lower_bound
upper_bound = round(third_quantile + 1.5 * IQR, 3)
upper_bound
# Select the rows falling outside the fences (none for this column)
outlier_mask = (data.round_no < lower_bound) | (data.round_no > upper_bound)
data[outlier_mask]
| id | year | institute_type | round_no | quota | pool | institute_short | program_name | program_duration | degree_short | category | opening_rank | closing_rank | is_preparatory |
|---|
# Display the institute_type value counts (only 'IIT' and 'NIT' occur)
data['institute_type'].value_counts()
IIT 13155 NIT 12303 Name: institute_type, dtype: int64
# Encode 'institute_type' as an integer flag: IIT -> 1, NIT -> 0.
# A single vectorized map replaces the chained str.replace/astype calls;
# the value_counts above confirms only these two values occur, so the
# mapping is total (any unexpected value would surface as NaN here
# instead of silently producing a partially-replaced string).
data['institute_type'] = data['institute_type'].map({'IIT': 1, 'NIT': 0}).astype(int)
# Display the encoded institute_type value counts
data['institute_type'].value_counts()
1 13155 0 12303 Name: institute_type, dtype: int64
# Display the pool value counts (only 'Gender-Neutral' and 'Female-Only' occur)
data['pool'].value_counts()
Gender-Neutral 16005 Female-Only 9453 Name: pool, dtype: int64
# Encode 'pool' as an integer flag: Gender-Neutral -> 1, Female-Only -> 0.
# A single vectorized map replaces the chained str.replace/astype calls;
# the value_counts above confirms only these two values occur.
data['pool'] = data['pool'].map({'Gender-Neutral': 1, 'Female-Only': 0}).astype(int)
# Display the encoded pool value counts
data['pool'].value_counts()
1 16005 0 9453 Name: pool, dtype: int64
# Display the program_duration value counts (only '4 Years' and '5 Years' occur)
data['program_duration'].value_counts()
4 Years 21104 5 Years 4354 Name: program_duration, dtype: int64
# Encode 'program_duration' as an integer flag: 4 Years -> 1, 5 Years -> 0.
# A single vectorized map replaces the chained str.replace/astype calls;
# the value_counts above confirms only these two values occur.
data['program_duration'] = data['program_duration'].map({'4 Years': 1, '5 Years': 0}).astype(int)
# Display the encoded program_duration value counts
data['program_duration'].value_counts()
1 21104 0 4354 Name: program_duration, dtype: int64
# Display the quota value counts (7 distinct categories)
data['quota'].value_counts()
AI 13155 OS 6502 HS 5486 JK 128 GO 95 AP 72 LA 20 Name: quota, dtype: int64
# Label-encode the multi-valued 'quota' categories as integers.
# LabelEncoder.fit_transform expects a 1-D array, so pass the Series
# data['quota'] — the original passed the 2-D frame data[['quota']],
# which is rejected/warned about by scikit-learn.
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
data['quota'] = LE.fit_transform(data['quota'])
# Display the institute_short value counts (next column to encode)
data['institute_short'].value_counts()
IIT-Kharagpur 2120 IIT-(BHU) Varanasi 1088 NIT-Rourkela 1054 IIT-Bombay 1034 IIT-Delhi 1018 IIT-Roorkee 989 IIT-Madras 949 IIT-Kanpur 844 NIT-Raipur 748 IIT-(ISM) Dhanbad 739 NIT-Calicut 695 NIT-Hamirpur 686 NIT-Jalandhar 649 NIT-Karnataka-Surathkal 632 NIT-Bhopal 624 NIT-Durgapur 605 NIT-Allahabad 605 IIT-Bhubaneswar 582 NIT-Agartala 561 NIT-Jaipur 549 IIT-Guwahati 531 IIT-Hyderabad 484 NIT-Kurukshetra 477 NIT-Patna 468 NIT-Jamshedpur 460 NIT-Srinagar 439 NIT-Silchar 409 IIT-Ropar 311 NIT-Warangal 306 NIT-Tiruchirappalli 303 IIT-Patna 290 IIT-Mandi 275 IIT-Gandhinagar 272 IIT-Jodhpur 265 NIT-Goa 264 IIT-Indore 244 IIT-Jammu 240 NIT-Puducherry 238 IIT-Tirupati 230 IIT-Palakkad 190 NIT-Arunachal-Pradesh 188 NIT-Manipur 188 NIT-Meghalaya 175 NIT-Delhi 162 IIT-Goa 161 NIT-Nagaland 160 IIT-Bhilai 155 NIT-Mizoram 153 NIT-Sikkim 147 IIT-Dharwad 144 NIT-Uttarakhand 99 NIT-Surat 93 NIT-Nagpur 92 NIT-Andhra-Pradesh 74 Name: institute_short, dtype: int64
# Label-encode the 54 'institute_short' categories as integers.
# Pass the 1-D Series (not data[['institute_short']]) — LabelEncoder
# expects a 1-D label array.
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
data['institute_short'] = LE.fit_transform(data['institute_short'])
# Display the program_name value counts (next column to encode)
data['program_name'].value_counts()
Computer Science and Engineering 3330
Mechanical Engineering 2774
Civil Engineering 2566
Electrical Engineering 2279
Electronics and Communication Engineering 1869
...
Manufacturing Science and Engineering with M.Tech. in Industrial andSystems Engineering and Management 7
Industrial and Systems Engineering with M.Tech. in Industrial and SystemsEngineering and Management 7
Agricultural and Food Engineering with M.Tech. in any of the listedspecializations 7
Engineering Physics and M.Tech. with specialization in Nano Science 5
Civil Engineering with M.Tech. in Structural Engineering 4
Name: program_name, Length: 130, dtype: int64
# Label-encode the 130 'program_name' categories as integers.
# Pass the 1-D Series (not data[['program_name']]) — LabelEncoder
# expects a 1-D label array.
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
data['program_name'] = LE.fit_transform(data['program_name'])
# Display the degree_short value counts (next column to encode)
data['degree_short'].value_counts()
B.Tech 20456 B.Tech + M.Tech (IDD) 2560 BSc 590 B.Arch 538 Int MSc. 298 Btech + M.Tech (IDD) 293 Int M.Tech 249 Int Msc. 233 BS + MS (IDD) 110 BSc + MSc (IDD) 69 B.Plan 54 B.Pharm 4 B.Pharm + M.Pharm 4 Name: degree_short, dtype: int64
# Label-encode the 13 'degree_short' categories as integers.
# Pass the 1-D Series (not data[['degree_short']]) — LabelEncoder
# expects a 1-D label array.
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
data['degree_short'] = LE.fit_transform(data['degree_short'])
# Display the category value counts (next column to encode)
data['category'].value_counts()
GEN 5252 OBC-NCL 4986 SC 4908 ST 4327 GEN-EWS 3205 GEN-PWD 1565 OBC-NCL-PWD 770 GEN-EWS-PWD 185 SC-PWD 182 ST-PWD 78 Name: category, dtype: int64
# Label-encode the 10 'category' (reservation category) values as integers.
# Pass the 1-D Series (not data[['category']]) — LabelEncoder expects a
# 1-D label array.
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
data['category'] = LE.fit_transform(data['category'])
# Confirm every column is now numeric
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 25458 entries, 0 to 25457 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 25458 non-null int64 1 year 25458 non-null int64 2 institute_type 25458 non-null int32 3 round_no 25458 non-null int64 4 quota 25458 non-null int64 5 pool 25458 non-null int32 6 institute_short 25458 non-null int32 7 program_name 25458 non-null int32 8 program_duration 25458 non-null int32 9 degree_short 25458 non-null int32 10 category 25458 non-null int32 11 opening_rank 25458 non-null int64 12 closing_rank 25458 non-null int64 13 is_preparatory 25458 non-null int64 dtypes: int32(7), int64(7) memory usage: 2.0 MB
# Display a random sample of 5 fully-encoded records
data.sample(5)
| id | year | institute_type | round_no | quota | pool | institute_short | program_name | program_duration | degree_short | category | opening_rank | closing_rank | is_preparatory | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11145 | 11146 | 2021 | 1 | 1 | 0 | 0 | 21 | 47 | 1 | 4 | 8 | 406 | 406 | 1 |
| 19082 | 23791 | 2020 | 0 | 6 | 6 | 1 | 30 | 110 | 1 | 4 | 0 | 36295 | 41685 | 0 |
| 6465 | 6466 | 2019 | 1 | 7 | 0 | 1 | 12 | 28 | 1 | 4 | 6 | 2274 | 2365 | 0 |
| 24895 | 30577 | 2021 | 0 | 1 | 3 | 0 | 46 | 26 | 1 | 4 | 6 | 7792 | 7792 | 0 |
| 24127 | 29810 | 2021 | 0 | 1 | 6 | 0 | 45 | 47 | 1 | 4 | 0 | 23670 | 26634 | 0 |
# Count the target variable classes and report their ratio
# (a ratio >= 10:1 indicates an imbalanced dataset)
is_preparatory_count = data.is_preparatory.value_counts()
class_ratio = round(is_preparatory_count[0] / is_preparatory_count[1], 2)
print('Class 0:', is_preparatory_count[0])
print('Class 1:', is_preparatory_count[1])
print('Proportion:', class_ratio, ': 1')
print('Total IIT-NIT Data records:', len(data))
Class 0: 24549 Class 1: 909 Proportion: 27.01 : 1 Total IIT-NIT Data records: 25458
# Identify the target (dependent) variable and the independent variables
TargetVar = 'is_preparatory'
Indepvar = [col for col in data.columns if col != TargetVar]
x = data[Indepvar]
y = data[TargetVar]
# Random oversampling of the minority class via RandomOverSampler
# (sampling_strategy=0.125 -> minority grown to 12.5% of the majority)
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy=0.125)
x_over, y_over = oversample.fit_resample(x, y)
print(x_over.shape)
print(y_over.shape)
(27617, 13) (27617,)
# Random oversampling can be implemented using the RandomOverSampler class
# NOTE(review): this cell is an exact repeat of the previous oversampling
# cell (likely a leftover from re-running the notebook) — consider removing
# one copy; also x_over/y_over are never used downstream
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy=0.125)
x_over, y_over = oversample.fit_resample(x, y)
print(x_over.shape)
print(y_over.shape)
(27617, 13) (27617,)
# Split the data into train and test sets (random sampling)
# 70% train / 30% test; fixed random_state for reproducibility
# NOTE(review): this splits the original x/y, not the oversampled
# x_over/y_over produced above — confirm which was intended
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
# Display the shapes of the train and test splits
x_train.shape,x_test.shape,y_train.shape,y_test.shape
((17820, 13), (7638, 13), (17820,), (7638,))
# Scale the features into [0, 1] with MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler(feature_range=(0, 1))
# Fit the scaler on the training data only, then apply it to both splits.
x_train = pd.DataFrame(mmscaler.fit_transform(x_train))
# The test set must be transformed with the train-set min/max: the original
# called fit_transform here as well, which re-fits on the test data —
# leaking test statistics and scaling the two splits inconsistently.
x_test = pd.DataFrame(mmscaler.transform(x_test))
# Build the 'Logistic Regression' model on the random-sampled train split
from sklearn.linear_model import LogisticRegression
# Create the model object (all hyper-parameters left at their defaults)
ModelLR= LogisticRegression()
# Train the model
ModelLR.fit(x_train,y_train)
# Predict on the test dataset: hard labels and class probabilities
y_pred=ModelLR.predict(x_test)
y_pred_prob=ModelLR.predict_proba(x_test)
# Display the algorithm parameters
params=ModelLR.get_params()
print(params)
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
# Evaluate the predictions: confusion matrix, classification report and
# derived metrics. labels=[1,0] puts the positive class first, so the
# flattened 2x2 matrix reads tp, fn, fp, tn.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

actual = y_test        # ground-truth labels
predicted = y_pred     # model predictions

matrix = confusion_matrix(actual, predicted, labels=[1, 0])
print('Confusion matrix : \n', matrix)

# Reuse the matrix instead of recomputing it
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

def _safe_div(num, den):
    # Return num/den, or NaN when the denominator is zero (undefined metric).
    return num / den if den else float('nan')

# Derived metrics, guarded against zero denominators: e.g. when the model
# never predicts the positive class, precision is undefined. The original
# relied on numpy emitting NaN with warnings globally suppressed; a plain
# Python-int zero division would have crashed.
sensitivity = round(_safe_div(tp, tp + fn), 3)
specificity = round(_safe_div(tn, tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(_safe_div(tp, tp + fp), 3)
f1Score = round(_safe_div(2 * tp, 2 * tp + fp + fn), 3)

# Matthews Correlation Coefficient (MCC) lies in [-1, +1]:
# +1 is a perfect model, -1 a completely wrong one.
from math import sqrt
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(_safe_div((tp * tn) - (fp * fn), sqrt(mx)), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100, 2), '%')
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC :', MCC)
# Area under the ROC curve from the hard label predictions
from sklearn.metrics import roc_curve, roc_auc_score
print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
# ROC curve: rank test samples by predicted probability of class 1
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelLR.predict_proba(x_test)[:, 1])
plt.figure()
# The original label used '%' on a string with no conversion specifier
# ('Classification Model' % model_roc_auc), which raises TypeError;
# embed the AUC in the legend with a proper format specifier instead.
plt.plot(fpr, tpr, label='Classification Model (area = %0.3f)' % model_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix :
[[ 0 262]
[ 0 7376]]
Outcome values :
0 262 0 7376
Classification report :
precision recall f1-score support
1 0.00 0.00 0.00 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.48 0.50 0.49 7638
weighted avg 0.93 0.97 0.95 7638
Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------
# Build the 'Decision tree algorithm' model on the random-sampled train split
from sklearn.tree import DecisionTreeClassifier
# Create the model object (default gini criterion, unconstrained depth)
ModelDT=DecisionTreeClassifier()
# Train the model
ModelDT.fit(x_train,y_train)
# Predict on the test dataset: hard labels and class probabilities
y_pred=ModelDT.predict(x_test)
y_pred_prob=ModelDT.predict_proba(x_test)
# Display the algorithm parameters
params=ModelDT.get_params()
print(params)
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
# Evaluate the decision-tree predictions: confusion matrix, classification
# report and derived metrics. labels=[1,0] puts the positive class first,
# so the flattened 2x2 matrix reads tp, fn, fp, tn.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

actual = y_test        # ground-truth labels
predicted = y_pred     # model predictions

matrix = confusion_matrix(actual, predicted, labels=[1, 0])
print('Confusion matrix : \n', matrix)

# Reuse the matrix instead of recomputing it
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

def _safe_div(num, den):
    # Return num/den, or NaN when the denominator is zero (undefined metric).
    return num / den if den else float('nan')

# Derived metrics, guarded against zero denominators (the original relied
# on numpy emitting NaN with warnings globally suppressed).
sensitivity = round(_safe_div(tp, tp + fn), 3)
specificity = round(_safe_div(tn, tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(_safe_div(tp, tp + fp), 3)
f1Score = round(_safe_div(2 * tp, 2 * tp + fp + fn), 3)

# Matthews Correlation Coefficient (MCC) lies in [-1, +1]:
# +1 is a perfect model, -1 a completely wrong one.
from math import sqrt
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(_safe_div((tp * tn) - (fp * fn), sqrt(mx)), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100, 2), '%')
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC :', MCC)
# Area under the ROC curve from the hard label predictions
from sklearn.metrics import roc_curve, roc_auc_score
print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
# ROC curve: rank test samples by predicted probability of class 1
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelDT.predict_proba(x_test)[:, 1])
plt.figure()
# Fixed: the original '%' on a string with no conversion specifier raises
# TypeError; embed the AUC with a proper format specifier.
plt.plot(fpr, tpr, label='Classification Model (area = %0.3f)' % model_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix :
[[ 170 92]
[ 218 7158]]
Outcome values :
170 92 218 7158
Classification report :
precision recall f1-score support
1 0.44 0.65 0.52 262
0 0.99 0.97 0.98 7376
accuracy 0.96 7638
macro avg 0.71 0.81 0.75 7638
weighted avg 0.97 0.96 0.96 7638
Accuracy : 95.9 %
Precision : 43.8 %
Recall : 64.9 %
F1 Score : 0.523
Specificity or True Negative Rate : 97.0 %
Balanced Accuracy : 81.0 %
MCC : 0.513
roc_auc_score: 0.81
-----------------------------------------------------------------------------------------------------
# Plot the fitted decision tree (unconstrained depth, so the figure is dense)
import matplotlib.pyplot as plt
from sklearn import tree
plt.figure(figsize=(20,5))
tree.plot_tree(ModelDT);
# Build the 'Random forest algorithm' model on the random-sampled train split
from sklearn.ensemble import RandomForestClassifier
# Create the model object (default 100 estimators)
ModelRF= RandomForestClassifier()
# Train the model
ModelRF.fit(x_train,y_train)
# Predict on the test dataset: hard labels and class probabilities
y_pred=ModelRF.predict(x_test)
y_pred_prob=ModelRF.predict_proba(x_test)
# Display the algorithm parameters
params=ModelRF.get_params()
print(params)
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
# Evaluate the random-forest predictions: confusion matrix, classification
# report and derived metrics. labels=[1,0] puts the positive class first,
# so the flattened 2x2 matrix reads tp, fn, fp, tn.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

actual = y_test        # ground-truth labels
predicted = y_pred     # model predictions

matrix = confusion_matrix(actual, predicted, labels=[1, 0])
print('Confusion matrix : \n', matrix)

# Reuse the matrix instead of recomputing it
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

def _safe_div(num, den):
    # Return num/den, or NaN when the denominator is zero (undefined metric).
    return num / den if den else float('nan')

# Derived metrics, guarded against zero denominators (the original relied
# on numpy emitting NaN with warnings globally suppressed).
sensitivity = round(_safe_div(tp, tp + fn), 3)
specificity = round(_safe_div(tn, tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(_safe_div(tp, tp + fp), 3)
f1Score = round(_safe_div(2 * tp, 2 * tp + fp + fn), 3)

# Matthews Correlation Coefficient (MCC) lies in [-1, +1]:
# +1 is a perfect model, -1 a completely wrong one.
from math import sqrt
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(_safe_div((tp * tn) - (fp * fn), sqrt(mx)), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100, 2), '%')
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC :', MCC)
# Area under the ROC curve from the hard label predictions
from sklearn.metrics import roc_curve, roc_auc_score
print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
# ROC curve: rank test samples by predicted probability of class 1
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelRF.predict_proba(x_test)[:, 1])
plt.figure()
# Fixed: the original '%' on a string with no conversion specifier raises
# TypeError; embed the AUC with a proper format specifier.
plt.plot(fpr, tpr, label='Classification Model (area = %0.3f)' % model_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix :
[[ 143 119]
[ 37 7339]]
Outcome values :
143 119 37 7339
Classification report :
precision recall f1-score support
1 0.79 0.55 0.65 262
0 0.98 0.99 0.99 7376
accuracy 0.98 7638
macro avg 0.89 0.77 0.82 7638
weighted avg 0.98 0.98 0.98 7638
Accuracy : 98.0 %
Precision : 79.4 %
Recall : 54.6 %
F1 Score : 0.647
Specificity or True Negative Rate : 99.5 %
Balanced Accuracy : 77.0 %
MCC : 0.649
roc_auc_score: 0.77
-----------------------------------------------------------------------------------------------------
# Build the 'Extra Trees Classifier' model with random sampling
# (the original comment said 'Random Forest'; this cell fits ExtraTrees)
from sklearn.ensemble import ExtraTreesClassifier
# Create an object for Extra Trees Classifier
ModelET = ExtraTreesClassifier()
# Train the model with train data
ModelET.fit(x_train,y_train)
# Predict on the test data set: hard labels and class probabilities
y_pred = ModelET.predict(x_test)
y_pred_prob = ModelET.predict_proba(x_test)
# Evaluate the extra-trees predictions: confusion matrix, classification
# report and derived metrics. labels=[1,0] puts the positive class first,
# so the flattened 2x2 matrix reads tp, fn, fp, tn.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

actual = y_test        # ground-truth labels
predicted = y_pred     # model predictions

matrix = confusion_matrix(actual, predicted, labels=[1, 0])
print('Confusion matrix : \n', matrix)

# Reuse the matrix instead of recomputing it
tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

def _safe_div(num, den):
    # Return num/den, or NaN when the denominator is zero (undefined metric).
    return num / den if den else float('nan')

# Derived metrics, guarded against zero denominators (the original relied
# on numpy emitting NaN with warnings globally suppressed).
sensitivity = round(_safe_div(tp, tp + fn), 3)
specificity = round(_safe_div(tn, tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(_safe_div(tp, tp + fp), 3)
f1Score = round(_safe_div(2 * tp, 2 * tp + fp + fn), 3)

# Matthews Correlation Coefficient (MCC) lies in [-1, +1]:
# +1 is a perfect model, -1 a completely wrong one.
from math import sqrt
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(_safe_div((tp * tn) - (fp * fn), sqrt(mx)), 3)

print('Accuracy :', round(accuracy*100, 2), '%')
print('Precision :', round(precision*100, 2), '%')
print('Recall :', round(sensitivity*100, 2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100, 2), '%')
print('Balanced Accuracy :', round(balanced_accuracy*100, 2), '%')
print('MCC :', MCC)
# Area under the ROC curve from the hard label predictions
from sklearn.metrics import roc_curve, roc_auc_score
print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
# ROC curve: rank test samples by predicted probability of class 1
model_roc_auc = roc_auc_score(actual, predicted)
fpr, tpr, thresholds = roc_curve(actual, ModelET.predict_proba(x_test)[:, 1])
plt.figure()
# Fixed: the original '%' on a string with no conversion specifier raises
# TypeError; embed the AUC with a proper format specifier.
plt.plot(fpr, tpr, label='Classification Model (area = %0.3f)' % model_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Persist the figure before showing it (show() clears the current figure)
plt.savefig('Log_ROC')
plt.show()
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix :
[[ 117 145]
[ 50 7326]]
Outcome values :
117 145 50 7326
Classification report :
precision recall f1-score support
1 0.70 0.45 0.55 262
0 0.98 0.99 0.99 7376
accuracy 0.97 7638
macro avg 0.84 0.72 0.77 7638
weighted avg 0.97 0.97 0.97 7638
Accuracy : 97.4 %
Precision : 70.1 %
Recall : 44.7 %
F1 Score : 0.545
Specificity or True Negative Rate : 99.3 %
Balanced Accuracy : 72.0 %
MCC : 0.547
roc_auc_score: 0.72
-----------------------------------------------------------------------------------------------------
# Display all the column names of the encoded dataset
data.columns
Index(['id', 'year', 'institute_type', 'round_no', 'quota', 'pool',
'institute_short', 'program_name', 'program_duration', 'degree_short',
'category', 'opening_rank', 'closing_rank', 'is_preparatory'],
dtype='object')
# Feature-name list for plotting the decision trees.
# The models were trained on the 13 independent variables only, so the
# target 'is_preparatory' must not appear here: the original 14-entry list
# mismatches the trees' 13 features (recent scikit-learn versions reject
# a feature_names list of the wrong length).
figcols = ['id', 'year', 'institute_type', 'round_no', 'quota', 'pool',
           'institute_short', 'program_name', 'program_duration', 'degree_short',
           'category', 'opening_rank', 'closing_rank']
# Visualize the first decision tree of the Extra Trees Classifier
from sklearn import tree
fn1 = figcols
cn1 = ['0', '1']
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
tree.plot_tree(ModelET.estimators_[0],
               feature_names=fn1,
               class_names=cn1,
               filled=True);
#fig.savefig('ModelET.png')
# Visualize the first 5 decision trees of the Extra Trees Classifier,
# one sub-plot per estimator (indentation restored for the loop body).
from sklearn import tree
fn2 = figcols
cn2 = ['0', '1']
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(10, 2), dpi=3000)
for index in range(0, 5):
    tree.plot_tree(ModelET.estimators_[index],
                   feature_names=fn2,
                   class_names=cn2,
                   filled=True,
                   ax=axes[index])
    axes[index].set_title('Estimator: ' + str(index), fontsize=11)
#fig.savefig('ModelET1.png')
# Load the KNN results workbook (template with metric columns)
# NOTE(review): nothing is ever appended to this frame in the visible code —
# confirm the per-K metrics were meant to be collected into it
KNNResults = pd.read_excel(r"KNN_ResultsNew.xlsx", header=0)
KNNResults.head()
| Model Name | KNN K Value | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy |
|---|
# Build and evaluate KNN models for K = 1..20, printing the metrics for each K
# and appending one summary row per K to the KNNResults frame.
from math import sqrt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)

# NOTE: the original initialized `accuracy = []` here, but the list was never
# appended to — `accuracy` is rebound to a float inside the loop — so the dead
# initializer is dropped.
for a in range(1, 21, 1):
    k = a
    # Build and train the model for the current K.
    ModelKNN = KNeighborsClassifier(n_neighbors=k)
    ModelKNN.fit(x_train, y_train)

    # Predict labels and class probabilities on the test set.
    y_pred = ModelKNN.predict(x_test)
    y_pred_prob = ModelKNN.predict_proba(x_test)

    print('KNN_K_value = ', a)
    print('Model Name: ', ModelKNN)

    # Confusion matrix with the positive class (1) listed first,
    # so the flattened order is tp, fn, fp, tn.
    actual = y_test
    predicted = y_pred
    matrix = confusion_matrix(actual, predicted, labels=[1, 0],
                              sample_weight=None, normalize=None)
    print('Confusion matrix : \n', matrix)

    tp, fn, fp, tn = matrix.reshape(-1)
    print('Outcome values : \n', tp, fn, fp, tn)

    C_Report = classification_report(actual, predicted, labels=[1, 0])
    print('Classification report : \n', C_Report)

    # Derived metrics from the confusion-matrix counts.
    sensitivity = round(tp / (tp + fn), 3)
    specificity = round(tn / (tn + fp), 3)
    accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
    balanced_accuracy = round((sensitivity + specificity) / 2, 3)
    precision = round(tp / (tp + fp), 3)
    f1Score = round((2 * tp / (2 * tp + fp + fn)), 3)

    # Matthews Correlation Coefficient (range -1..+1; +1 is a perfect model).
    mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

    print('Accuracy :', round(accuracy*100, 2),'%')
    print('Precision :', round(precision*100, 2),'%')
    print('Recall :', round(sensitivity*100,2), '%')
    print('F1 Score :', f1Score)
    print('Specificity or True Negative Rate :', round(specificity*100,2), '%' )
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
    print('MCC :', MCC)

    # Area under the ROC curve from the hard predictions (as before).
    model_roc_auc = roc_auc_score(actual, predicted)
    print('roc_auc_score:', round(model_roc_auc, 3))

    # ROC curve from the positive-class probabilities (reuse y_pred_prob
    # instead of calling predict_proba a second time).
    fpr, tpr, thresholds = roc_curve(actual, y_pred_prob[:, 1])
    plt.figure()
    # BUG FIX: the original label used '%' on a string with no conversion
    # specifier ('Classification Model' % model_roc_auc), which raises
    # TypeError at runtime.
    plt.plot(fpr, tpr, label='Classification Model (area = %0.2f)' % model_roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    #plt.savefig('Log_ROC')
    plt.show()
    #------------------------------------------------------------------------------
    new_row = {'Model Name' : ModelKNN,
               'KNN K Value' : a,
               'True_Positive' : tp,
               'False_Negative' : fn,
               'False_Positive' : fp,
               'True_Negative' : tn,
               'Accuracy' : accuracy,
               'Precision' : precision,
               'Recall' : sensitivity,
               'F1 Score' : f1Score,
               'Specificity' : specificity,
               'MCC' : MCC,
               'ROC_AUC_Score' : model_roc_auc,
               'Balanced Accuracy' : balanced_accuracy}
    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
    KNNResults = pd.concat([KNNResults, pd.DataFrame([new_row])],
                           ignore_index=True)
#------KNN_Results------------------------------------------------------------------------
KNN_K_value = 1
Model Name: KNeighborsClassifier(n_neighbors=1)
Confusion matrix :
[[ 140 122]
[ 102 7274]]
Outcome values :
140 122 102 7274
Classification report :
precision recall f1-score support
1 0.58 0.53 0.56 262
0 0.98 0.99 0.98 7376
accuracy 0.97 7638
macro avg 0.78 0.76 0.77 7638
weighted avg 0.97 0.97 0.97 7638
Accuracy : 97.1 %
Precision : 57.9 %
Recall : 53.4 %
F1 Score : 0.556
Specificity or True Negative Rate : 98.6 %
Balanced Accuracy : 76.0 %
MCC : 0.541
roc_auc_score: 0.76
KNN_K_value = 2
Model Name: KNeighborsClassifier(n_neighbors=2)
Confusion matrix :
[[ 63 199]
[ 43 7333]]
Outcome values :
63 199 43 7333
Classification report :
precision recall f1-score support
1 0.59 0.24 0.34 262
0 0.97 0.99 0.98 7376
accuracy 0.97 7638
macro avg 0.78 0.62 0.66 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.8 %
Precision : 59.4 %
Recall : 24.0 %
F1 Score : 0.342
Specificity or True Negative Rate : 99.4 %
Balanced Accuracy : 61.7 %
MCC : 0.365
roc_auc_score: 0.617
KNN_K_value = 3
Model Name: KNeighborsClassifier(n_neighbors=3)
Confusion matrix :
[[ 106 156]
[ 77 7299]]
Outcome values :
106 156 77 7299
Classification report :
precision recall f1-score support
1 0.58 0.40 0.48 262
0 0.98 0.99 0.98 7376
accuracy 0.97 7638
macro avg 0.78 0.70 0.73 7638
weighted avg 0.97 0.97 0.97 7638
Accuracy : 96.9 %
Precision : 57.9 %
Recall : 40.5 %
F1 Score : 0.476
Specificity or True Negative Rate : 99.0 %
Balanced Accuracy : 69.8 %
MCC : 0.469
roc_auc_score: 0.697
KNN_K_value = 4
Model Name: KNeighborsClassifier(n_neighbors=4)
Confusion matrix :
[[ 62 200]
[ 35 7341]]
Outcome values :
62 200 35 7341
Classification report :
precision recall f1-score support
1 0.64 0.24 0.35 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.81 0.62 0.66 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 63.9 %
Recall : 23.7 %
F1 Score : 0.345
Specificity or True Negative Rate : 99.5 %
Balanced Accuracy : 61.6 %
MCC : 0.377
roc_auc_score: 0.616
KNN_K_value = 5
Model Name: KNeighborsClassifier()
Confusion matrix :
[[ 83 179]
[ 62 7314]]
Outcome values :
83 179 62 7314
Classification report :
precision recall f1-score support
1 0.57 0.32 0.41 262
0 0.98 0.99 0.98 7376
accuracy 0.97 7638
macro avg 0.77 0.65 0.70 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.8 %
Precision : 57.2 %
Recall : 31.7 %
F1 Score : 0.408
Specificity or True Negative Rate : 99.2 %
Balanced Accuracy : 65.4 %
MCC : 0.411
roc_auc_score: 0.654
KNN_K_value = 6
Model Name: KNeighborsClassifier(n_neighbors=6)
Confusion matrix :
[[ 58 204]
[ 25 7351]]
Outcome values :
58 204 25 7351
Classification report :
precision recall f1-score support
1 0.70 0.22 0.34 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.84 0.61 0.66 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 97.0 %
Precision : 69.9 %
Recall : 22.1 %
F1 Score : 0.336
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 60.9 %
MCC : 0.383
roc_auc_score: 0.609
KNN_K_value = 7
Model Name: KNeighborsClassifier(n_neighbors=7)
Confusion matrix :
[[ 77 185]
[ 41 7335]]
Outcome values :
77 185 41 7335
Classification report :
precision recall f1-score support
1 0.65 0.29 0.41 262
0 0.98 0.99 0.98 7376
accuracy 0.97 7638
macro avg 0.81 0.64 0.70 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 97.0 %
Precision : 65.3 %
Recall : 29.4 %
F1 Score : 0.405
Specificity or True Negative Rate : 99.4 %
Balanced Accuracy : 64.4 %
MCC : 0.426
roc_auc_score: 0.644
KNN_K_value = 8
Model Name: KNeighborsClassifier(n_neighbors=8)
Confusion matrix :
[[ 50 212]
[ 22 7354]]
Outcome values :
50 212 22 7354
Classification report :
precision recall f1-score support
1 0.69 0.19 0.30 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.83 0.59 0.64 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 69.4 %
Recall : 19.1 %
F1 Score : 0.299
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 59.4 %
MCC : 0.354
roc_auc_score: 0.594
KNN_K_value = 9
Model Name: KNeighborsClassifier(n_neighbors=9)
Confusion matrix :
[[ 58 204]
[ 31 7345]]
Outcome values :
58 204 31 7345
Classification report :
precision recall f1-score support
1 0.65 0.22 0.33 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.81 0.61 0.66 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 65.2 %
Recall : 22.1 %
F1 Score : 0.33
Specificity or True Negative Rate : 99.6 %
Balanced Accuracy : 60.8 %
MCC : 0.368
roc_auc_score: 0.609
KNN_K_value = 10
Model Name: KNeighborsClassifier(n_neighbors=10)
Confusion matrix :
[[ 48 214]
[ 16 7360]]
Outcome values :
48 214 16 7360
Classification report :
precision recall f1-score support
1 0.75 0.18 0.29 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.86 0.59 0.64 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 97.0 %
Precision : 75.0 %
Recall : 18.3 %
F1 Score : 0.294
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 59.0 %
MCC : 0.361
roc_auc_score: 0.591
KNN_K_value = 11
Model Name: KNeighborsClassifier(n_neighbors=11)
Confusion matrix :
[[ 55 207]
[ 24 7352]]
Outcome values :
55 207 24 7352
Classification report :
precision recall f1-score support
1 0.70 0.21 0.32 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.83 0.60 0.65 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 97.0 %
Precision : 69.6 %
Recall : 21.0 %
F1 Score : 0.323
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 60.4 %
MCC : 0.372
roc_auc_score: 0.603
KNN_K_value = 12
Model Name: KNeighborsClassifier(n_neighbors=12)
Confusion matrix :
[[ 45 217]
[ 18 7358]]
Outcome values :
45 217 18 7358
Classification report :
precision recall f1-score support
1 0.71 0.17 0.28 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.84 0.58 0.63 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 71.4 %
Recall : 17.2 %
F1 Score : 0.277
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 58.5 %
MCC : 0.341
roc_auc_score: 0.585
KNN_K_value = 13
Model Name: KNeighborsClassifier(n_neighbors=13)
Confusion matrix :
[[ 50 212]
[ 21 7355]]
Outcome values :
50 212 21 7355
Classification report :
precision recall f1-score support
1 0.70 0.19 0.30 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.84 0.59 0.64 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 70.4 %
Recall : 19.1 %
F1 Score : 0.3
Specificity or True Negative Rate : 99.7 %
Balanced Accuracy : 59.4 %
MCC : 0.357
roc_auc_score: 0.594
KNN_K_value = 14
Model Name: KNeighborsClassifier(n_neighbors=14)
Confusion matrix :
[[ 41 221]
[ 13 7363]]
Outcome values :
41 221 13 7363
Classification report :
precision recall f1-score support
1 0.76 0.16 0.26 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.87 0.58 0.62 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 75.9 %
Recall : 15.6 %
F1 Score : 0.259
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 57.7 %
MCC : 0.336
roc_auc_score: 0.577
KNN_K_value = 15
Model Name: KNeighborsClassifier(n_neighbors=15)
Confusion matrix :
[[ 48 214]
[ 18 7358]]
Outcome values :
48 214 18 7358
Classification report :
precision recall f1-score support
1 0.73 0.18 0.29 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.85 0.59 0.64 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 97.0 %
Precision : 72.7 %
Recall : 18.3 %
F1 Score : 0.293
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 59.0 %
MCC : 0.355
roc_auc_score: 0.59
KNN_K_value = 16
Model Name: KNeighborsClassifier(n_neighbors=16)
Confusion matrix :
[[ 40 222]
[ 12 7364]]
Outcome values :
40 222 12 7364
Classification report :
precision recall f1-score support
1 0.77 0.15 0.25 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.87 0.58 0.62 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 76.9 %
Recall : 15.3 %
F1 Score : 0.255
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 57.6 %
MCC : 0.334
roc_auc_score: 0.576
KNN_K_value = 17
Model Name: KNeighborsClassifier(n_neighbors=17)
Confusion matrix :
[[ 45 217]
[ 13 7363]]
Outcome values :
45 217 13 7363
Classification report :
precision recall f1-score support
1 0.78 0.17 0.28 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.87 0.58 0.63 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 97.0 %
Precision : 77.6 %
Recall : 17.2 %
F1 Score : 0.281
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 58.5 %
MCC : 0.356
roc_auc_score: 0.585
KNN_K_value = 18
Model Name: KNeighborsClassifier(n_neighbors=18)
Confusion matrix :
[[ 38 224]
[ 12 7364]]
Outcome values :
38 224 12 7364
Classification report :
precision recall f1-score support
1 0.76 0.15 0.24 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.87 0.57 0.61 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 76.0 %
Recall : 14.5 %
F1 Score : 0.244
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 57.2 %
MCC : 0.324
roc_auc_score: 0.572
KNN_K_value = 19
Model Name: KNeighborsClassifier(n_neighbors=19)
Confusion matrix :
[[ 44 218]
[ 14 7362]]
Outcome values :
44 218 14 7362
Classification report :
precision recall f1-score support
1 0.76 0.17 0.28 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.86 0.58 0.63 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 97.0 %
Precision : 75.9 %
Recall : 16.8 %
F1 Score : 0.275
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 58.3 %
MCC : 0.348
roc_auc_score: 0.583
KNN_K_value = 20
Model Name: KNeighborsClassifier(n_neighbors=20)
Confusion matrix :
[[ 36 226]
[ 12 7364]]
Outcome values :
36 226 12 7364
Classification report :
precision recall f1-score support
1 0.75 0.14 0.23 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.86 0.57 0.61 7638
weighted avg 0.96 0.97 0.96 7638
Accuracy : 96.9 %
Precision : 75.0 %
Recall : 13.7 %
F1 Score : 0.232
Specificity or True Negative Rate : 99.8 %
Balanced Accuracy : 56.8 %
MCC : 0.313
roc_auc_score: 0.568
# display the KNNResults
KNNResults
| Model Name | KNN K Value | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KNeighborsClassifier(n_neighbors=1) | 1 | 140 | 122 | 102 | 7274 | 0.971 | 0.579 | 0.534 | 0.556 | 0.986 | 0.541 | 0.760261 | 0.76 |
| 1 | KNeighborsClassifier(n_neighbors=2) | 2 | 63 | 199 | 43 | 7333 | 0.968 | 0.594 | 0.24 | 0.342 | 0.994 | 0.365 | 0.617314 | 0.617 |
| 2 | KNeighborsClassifier(n_neighbors=3) | 3 | 106 | 156 | 77 | 7299 | 0.969 | 0.579 | 0.405 | 0.476 | 0.99 | 0.469 | 0.69707 | 0.698 |
| 3 | KNeighborsClassifier(n_neighbors=4) | 4 | 62 | 200 | 35 | 7341 | 0.969 | 0.639 | 0.237 | 0.345 | 0.995 | 0.377 | 0.615948 | 0.616 |
| 4 | KNeighborsClassifier() | 5 | 83 | 179 | 62 | 7314 | 0.968 | 0.572 | 0.317 | 0.408 | 0.992 | 0.411 | 0.654194 | 0.654 |
| 5 | KNeighborsClassifier(n_neighbors=6) | 6 | 58 | 204 | 25 | 7351 | 0.97 | 0.699 | 0.221 | 0.336 | 0.997 | 0.383 | 0.608992 | 0.609 |
| 6 | KNeighborsClassifier(n_neighbors=7) | 7 | 77 | 185 | 41 | 7335 | 0.97 | 0.653 | 0.294 | 0.405 | 0.994 | 0.426 | 0.644167 | 0.644 |
| 7 | KNeighborsClassifier(n_neighbors=8) | 8 | 50 | 212 | 22 | 7354 | 0.969 | 0.694 | 0.191 | 0.299 | 0.997 | 0.354 | 0.593929 | 0.594 |
| 8 | KNeighborsClassifier(n_neighbors=9) | 9 | 58 | 204 | 31 | 7345 | 0.969 | 0.652 | 0.221 | 0.33 | 0.996 | 0.368 | 0.608586 | 0.608 |
| 9 | KNeighborsClassifier(n_neighbors=10) | 10 | 48 | 214 | 16 | 7360 | 0.97 | 0.75 | 0.183 | 0.294 | 0.998 | 0.361 | 0.590518 | 0.59 |
| 10 | KNeighborsClassifier(n_neighbors=11) | 11 | 55 | 207 | 24 | 7352 | 0.97 | 0.696 | 0.21 | 0.323 | 0.997 | 0.372 | 0.603335 | 0.604 |
| 11 | KNeighborsClassifier(n_neighbors=12) | 12 | 45 | 217 | 18 | 7358 | 0.969 | 0.714 | 0.172 | 0.277 | 0.998 | 0.341 | 0.584658 | 0.585 |
| 12 | KNeighborsClassifier(n_neighbors=13) | 13 | 50 | 212 | 21 | 7355 | 0.969 | 0.704 | 0.191 | 0.3 | 0.997 | 0.357 | 0.593996 | 0.594 |
| 13 | KNeighborsClassifier(n_neighbors=14) | 14 | 41 | 221 | 13 | 7363 | 0.969 | 0.759 | 0.156 | 0.259 | 0.998 | 0.336 | 0.577363 | 0.577 |
| 14 | KNeighborsClassifier(n_neighbors=15) | 15 | 48 | 214 | 18 | 7358 | 0.97 | 0.727 | 0.183 | 0.293 | 0.998 | 0.355 | 0.590383 | 0.59 |
| 15 | KNeighborsClassifier(n_neighbors=16) | 16 | 40 | 222 | 12 | 7364 | 0.969 | 0.769 | 0.153 | 0.255 | 0.998 | 0.334 | 0.575522 | 0.576 |
| 16 | KNeighborsClassifier(n_neighbors=17) | 17 | 45 | 217 | 13 | 7363 | 0.97 | 0.776 | 0.172 | 0.281 | 0.998 | 0.356 | 0.584997 | 0.585 |
| 17 | KNeighborsClassifier(n_neighbors=18) | 18 | 38 | 224 | 12 | 7364 | 0.969 | 0.76 | 0.145 | 0.244 | 0.998 | 0.324 | 0.571706 | 0.572 |
| 18 | KNeighborsClassifier(n_neighbors=19) | 19 | 44 | 218 | 14 | 7362 | 0.97 | 0.759 | 0.168 | 0.275 | 0.998 | 0.348 | 0.58302 | 0.583 |
| 19 | KNeighborsClassifier(n_neighbors=20) | 20 | 36 | 226 | 12 | 7364 | 0.969 | 0.75 | 0.137 | 0.232 | 0.998 | 0.313 | 0.567889 | 0.568 |
# Train a Gaussian Naive Bayes classifier on the training set and print its
# evaluation metrics and ROC curve on the test set.
from math import sqrt

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)

modelGNB = GaussianNB(priors=None, var_smoothing=1e-09)
# Fit the model with the train data.
modelGNB.fit(x_train, y_train)

# Predict labels and class probabilities on the test set.
y_pred = modelGNB.predict(x_test)
y_pred_prob = modelGNB.predict_proba(x_test)

# Confusion matrix with the positive class (1) listed first,
# so the flattened order is tp, fn, fp, tn.
actual = y_test
predicted = y_pred
matrix = confusion_matrix(actual, predicted, labels=[1, 0],
                          sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

# Derived metrics from the confusion-matrix counts.
sensitivity = round(tp / (tp + fn), 3)
specificity = round(tn / (tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(tp / (tp + fp), 3)
f1Score = round((2 * tp / (2 * tp + fp + fn)), 3)

# Matthews Correlation Coefficient (range -1..+1; +1 is a perfect model).
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%' )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under the ROC curve from the hard predictions (as before).
model_roc_auc = roc_auc_score(actual, predicted)
print('roc_auc_score:', round(model_roc_auc, 3))

# ROC curve from the positive-class probabilities (reuse y_pred_prob
# instead of calling predict_proba a second time).
fpr, tpr, thresholds = roc_curve(actual, y_pred_prob[:, 1])
plt.figure()
# BUG FIX: the original label used '%' on a string with no conversion
# specifier, which raises TypeError at runtime.
plt.plot(fpr, tpr, label='Classification Model (area = %0.2f)' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print('-----------------------------------------------------------------------------------------------------')
Confusion matrix :
[[ 262 0]
[2131 5245]]
Outcome values :
262 0 2131 5245
Classification report :
precision recall f1-score support
1 0.11 1.00 0.20 262
0 1.00 0.71 0.83 7376
accuracy 0.72 7638
macro avg 0.55 0.86 0.51 7638
weighted avg 0.97 0.72 0.81 7638
Accuracy : 72.1 %
Precision : 10.9 %
Recall : 100.0 %
F1 Score : 0.197
Specificity or True Negative Rate : 71.1 %
Balanced Accuracy : 85.5 %
MCC : 0.279
roc_auc_score: 0.856
-----------------------------------------------------------------------------------------------------
# Load the (empty) ensemble-metrics results template from Excel; one row per
# SVM variant is appended below.
EMResults1 = pd.read_excel(r"EMResultsNew.xlsx", header=0)
# Show the column layout of the results frame.
EMResults1.head()
| Model Name | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy |
|---|
# Train a linear-kernel SVM, print its evaluation metrics and ROC curve on the
# test set, and append a summary row to EMResults1.
from math import sqrt

from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)

ModelSVM1 = SVC(C=1.0, kernel='linear', degree=3, gamma='scale', coef0=0.0,
                shrinking=True, probability=True, tol=0.001, cache_size=200,
                class_weight=None, verbose=False, max_iter=-1,
                decision_function_shape='ovr', break_ties=False,
                random_state=None)
# Train the model with the train data.
ModelSVM1 = ModelSVM1.fit(x_train, y_train)

# Predict labels and class probabilities on the test set.
y_pred = ModelSVM1.predict(x_test)
y_pred_prob = ModelSVM1.predict_proba(x_test)

print('Model Name: ', "SVM - Linear")

# Confusion matrix with the positive class (1) listed first,
# so the flattened order is tp, fn, fp, tn.
actual = y_test
predicted = y_pred
matrix = confusion_matrix(actual, predicted, labels=[1, 0],
                          sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

# Derived metrics. NOTE(review): the counts are numpy integers, so 0/0
# divisions (e.g. precision when tp+fp == 0) yield nan rather than raising —
# the printed output for this model shows exactly that.
sensitivity = round(tp / (tp + fn), 3)
specificity = round(tn / (tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(tp / (tp + fp), 3)
f1Score = round((2 * tp / (2 * tp + fp + fn)), 3)

# Matthews Correlation Coefficient (range -1..+1; +1 is a perfect model).
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%' )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under the ROC curve from the hard predictions (as before).
model_roc_auc = roc_auc_score(actual, predicted)
print('roc_auc_score:', round(model_roc_auc, 3))

# ROC curve from the positive-class probabilities (reuse y_pred_prob
# instead of calling predict_proba a second time).
fpr, tpr, thresholds = roc_curve(actual, y_pred_prob[:, 1])
plt.figure()
# BUG FIX: the original label used '%' on a string with no conversion
# specifier, which raises TypeError at runtime.
plt.plot(fpr, tpr, label='Classification Model (area = %0.2f)' % model_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print('-----------------------------------------------------------------------------------------------------')
#---
new_row = {'Model Name' : "SVM - Linear",
           'True_Positive' : tp,
           'False_Negative' : fn,
           'False_Positive' : fp,
           'True_Negative' : tn,
           'Accuracy' : accuracy,
           'Precision' : precision,
           'Recall' : sensitivity,
           'F1 Score' : f1Score,
           'Specificity' : specificity,
           'MCC' : MCC,
           'ROC_AUC_Score' : model_roc_auc,
           'Balanced Accuracy' : balanced_accuracy}
# DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
EMResults1 = pd.concat([EMResults1, pd.DataFrame([new_row])], ignore_index=True)
#-------------------------------------------------------------------------------------------------------------
Model Name: SVM - Linear
Confusion matrix :
[[ 0 262]
[ 0 7376]]
Outcome values :
0 262 0 7376
Classification report :
precision recall f1-score support
1 0.00 0.00 0.00 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.48 0.50 0.49 7638
weighted avg 0.93 0.97 0.95 7638
Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------
# display the EMResults
EMResults1.head()
| Model Name | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SVM - Linear | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
# Train a degree-2 polynomial-kernel SVM, print its evaluation metrics and ROC
# curve on the test set, and append a summary row to EMResults1.
from math import sqrt

from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)

ModelSVMPoly = SVC(kernel='poly', degree=2, probability=True)
# Train the model with the train data.
ModelSVMPoly.fit(x_train, y_train)

# Predict labels and class probabilities on the test set.
y_pred = ModelSVMPoly.predict(x_test)
y_pred_prob = ModelSVMPoly.predict_proba(x_test)

print('Model Name: ', "SVM - Polynominal")

# Confusion matrix with the positive class (1) listed first,
# so the flattened order is tp, fn, fp, tn.
actual = y_test
predicted = y_pred
matrix = confusion_matrix(actual, predicted, labels=[1, 0],
                          sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

# Derived metrics. NOTE(review): the counts are numpy integers, so 0/0
# divisions (e.g. precision when tp+fp == 0) yield nan rather than raising.
sensitivity = round(tp / (tp + fn), 3)
specificity = round(tn / (tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(tp / (tp + fp), 3)
f1Score = round((2 * tp / (2 * tp + fp + fn)), 3)

# Matthews Correlation Coefficient (range -1..+1; +1 is a perfect model).
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%' )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under the ROC curve from the hard predictions (as before).
logit_roc_auc = roc_auc_score(y_test, y_pred)
print('roc_auc_score:', round(logit_roc_auc, 3))

# ROC curve from the positive-class probabilities (reuse y_pred_prob
# instead of calling predict_proba a second time).
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:, 1])
plt.figure()
# BUG FIX: the original label used '%' on a string with no conversion
# specifier, which raises TypeError at runtime.
plt.plot(fpr, tpr, label='Classification Model (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print('-----------------------------------------------------------------------------------------------------')
#---
new_row = {'Model Name' : "SVM - Polynominal",
           'True_Positive' : tp,
           'False_Negative' : fn,
           'False_Positive' : fp,
           'True_Negative' : tn,
           'Accuracy' : accuracy,
           'Precision' : precision,
           'Recall' : sensitivity,
           'F1 Score' : f1Score,
           'Specificity' : specificity,
           'MCC' : MCC,
           'ROC_AUC_Score' : logit_roc_auc,
           'Balanced Accuracy' : balanced_accuracy}
# DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
EMResults1 = pd.concat([EMResults1, pd.DataFrame([new_row])], ignore_index=True)
#-----------------------------------------------------------------------------------------------
Model Name: SVM - Polynominal
Confusion matrix :
[[ 0 262]
[ 0 7376]]
Outcome values :
0 262 0 7376
Classification report :
precision recall f1-score support
1 0.00 0.00 0.00 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.48 0.50 0.49 7638
weighted avg 0.93 0.97 0.95 7638
Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------
# display the EMResults
EMResults1.head()
| Model Name | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SVM - Linear | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
| 1 | SVM - Polynominal | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
# Train an RBF-kernel SVM with balanced class weights, print its evaluation
# metrics and ROC curve on the test set, and append a summary row to EMResults1.
from math import sqrt

from sklearn.svm import SVC
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)

ModelSVMGaussian = SVC(kernel='rbf', random_state=42, class_weight='balanced',
                       probability=True)
# Train the model with the train data.
ModelSVMGaussian.fit(x_train, y_train)

# Predict labels and class probabilities on the test set.
y_pred = ModelSVMGaussian.predict(x_test)
y_pred_prob = ModelSVMGaussian.predict_proba(x_test)

print('Model Name: ', "SVM - Gaussian")

# Confusion matrix with the positive class (1) listed first,
# so the flattened order is tp, fn, fp, tn.
actual = y_test
predicted = y_pred
matrix = confusion_matrix(actual, predicted, labels=[1, 0],
                          sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = matrix.reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)

# Derived metrics from the confusion-matrix counts.
sensitivity = round(tp / (tp + fn), 3)
specificity = round(tn / (tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(tp / (tp + fp), 3)
f1Score = round((2 * tp / (2 * tp + fp + fn)), 3)

# Matthews Correlation Coefficient (range -1..+1; +1 is a perfect model).
mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)

print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%' )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)

# Area under the ROC curve from the hard predictions (as before).
logit_roc_auc = roc_auc_score(y_test, y_pred)
print('roc_auc_score:', round(logit_roc_auc, 3))

# ROC curve from the positive-class probabilities (reuse y_pred_prob
# instead of calling predict_proba a second time).
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:, 1])
plt.figure()
# BUG FIX: the original label used '%' on a string with no conversion
# specifier, which raises TypeError at runtime.
plt.plot(fpr, tpr, label='Classification Model (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print('-----------------------------------------------------------------------------------------------------')
#---
new_row = {'Model Name' : "SVM - Gaussian",
           'True_Positive' : tp,
           'False_Negative' : fn,
           'False_Positive' : fp,
           'True_Negative' : tn,
           'Accuracy' : accuracy,
           'Precision' : precision,
           'Recall' : sensitivity,
           'F1 Score' : f1Score,
           'Specificity' : specificity,
           'MCC' : MCC,
           'ROC_AUC_Score' : logit_roc_auc,
           'Balanced Accuracy' : balanced_accuracy}
# DataFrame.append was removed in pandas 2.0; concat a one-row frame instead.
EMResults1 = pd.concat([EMResults1, pd.DataFrame([new_row])], ignore_index=True)
#---------------------------------------------------------------------------------------------------------------
Model Name: SVM - Gaussian
Confusion matrix :
[[ 257 5]
[1415 5961]]
Outcome values :
257 5 1415 5961
Classification report :
precision recall f1-score support
1 0.15 0.98 0.27 262
0 1.00 0.81 0.89 7376
accuracy 0.81 7638
macro avg 0.58 0.89 0.58 7638
weighted avg 0.97 0.81 0.87 7638
Accuracy : 81.4 %
Precision : 15.4 %
Recall : 98.1 %
F1 Score : 0.266
Specificity or True Negative Rate : 80.8 %
Balanced Accuracy : 89.5 %
MCC : 0.347
roc_auc_score: 0.895
-----------------------------------------------------------------------------------------------------
# display the EMResults
EMResults1.head()
| Model Name | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SVM - Linear | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
| 1 | SVM - Polynominal | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
| 2 | SVM - Gaussian | 257 | 5 | 1415 | 5961 | 0.814 | 0.154 | 0.981 | 0.266 | 0.808 | 0.347 | 0.894539 | 0.895 |
# Train an SVM with a sigmoid kernel. class_weight='balanced' compensates for
# the heavy skew towards the negative class; probability=True enables
# predict_proba so an ROC curve can be drawn later.
from sklearn.svm import SVC
ModelSVMSig = SVC(kernel='sigmoid', random_state = 42, class_weight='balanced', probability=True)
ModelSVMSig.fit(x_train, y_train)
# Hard class predictions and class-probability estimates on the test set
y_pred = ModelSVMSig.predict(x_test)
y_pred_prob = ModelSVMSig.predict_proba(x_test)
print('Model Name: ', "SVM - Sigmoid")
# Evaluate the sigmoid-kernel SVM predictions against the held-out labels.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Ground truth and model output, under the names used by the metric code below
actual = y_test
predicted = y_pred
# Confusion matrix with the positive class (1) listed first
matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None)
print('Confusion matrix : \n', matrix)
# Flattened counts follow the labels=[1, 0] ordering: tp, fn, fp, tn
tp, fn, fp, tn = confusion_matrix(actual, predicted, labels=[1, 0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)
# Per-class precision/recall/F1 plus overall accuracy
C_Report = classification_report(actual, predicted, labels=[1, 0])
print('Classification report : \n', C_Report)
# Scalar metrics derived from the confusion-matrix counts
sensitivity = round(tp / (tp + fn), 3)
specificity = round(tn / (tn + fp), 3)
accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
balanced_accuracy = round((sensitivity + specificity) / 2, 3)
precision = round(tp / (tp + fp), 3)
f1Score = round((2 * tp / (2 * tp + fp + fn)), 3)
# Matthews Correlation Coefficient (MCC) ranges from -1 (poor) to +1 (perfect)
from math import sqrt
denom_product = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
MCC = round(((tp * tn) - (fp * fn)) / sqrt(denom_product), 3)
# Report everything as percentages where conventional
print('Accuracy :', round(accuracy*100, 2),'%')
print('Precision :', round(precision*100, 2),'%')
print('Recall :', round(sensitivity*100,2), '%')
print('F1 Score :', f1Score)
print('Specificity or True Negative Rate :', round(specificity*100,2), '%' )
print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
print('MCC :', MCC)
# Area under ROC curve (from the hard 0/1 predictions)
from sklearn.metrics import roc_curve, roc_auc_score
print('roc_auc_score:', round(roc_auc_score(y_test, y_pred), 3))
# ROC Curve: TPR vs FPR computed from the positive-class probabilities
logit_roc_auc = roc_auc_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, ModelSVMSig.predict_proba(x_test)[:, 1])
plt.figure()
# Fix: the original label string ('Classification Model' % logit_roc_auc) had
# no % placeholder, which raises TypeError; include the AUC in the legend.
plt.plot(fpr, tpr, label='Classification Model (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save the figure to disk before rendering it inline
plt.savefig('Log_ROC')
plt.show()
print('-----------------------------------------------------------------------------------------------------')
#---
# Collect the sigmoid-kernel SVM evaluation metrics into one row of the
# model-comparison table (EMResults1).
new_row = {'Model Name' : "SVM - Sigmoid",
           'True_Positive' : tp,
           'False_Negative' : fn,
           'False_Positive' : fp,
           'True_Negative' : tn,
           'Accuracy' : accuracy,
           'Precision' : precision,
           'Recall' : sensitivity,
           'F1 Score' : f1Score,
           'Specificity' : specificity,
           'MCC' : MCC,
           'ROC_AUC_Score' : roc_auc_score(actual, predicted),
           'Balanced Accuracy' : balanced_accuracy}
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add a row (also works on older pandas versions).
EMResults1 = pd.concat([EMResults1, pd.DataFrame([new_row])], ignore_index=True)
#-----------------------------------------------------------------------------------------------------------
Model Name: SVM - Sigmoid
Confusion matrix :
[[ 158 104]
[3675 3701]]
Outcome values :
158 104 3675 3701
Classification report :
precision recall f1-score support
1 0.04 0.60 0.08 262
0 0.97 0.50 0.66 7376
accuracy 0.51 7638
macro avg 0.51 0.55 0.37 7638
weighted avg 0.94 0.51 0.64 7638
Accuracy : 50.5 %
Precision : 4.1 %
Recall : 60.3 %
F1 Score : 0.077
Specificity or True Negative Rate : 50.2 %
Balanced Accuracy : 55.2 %
MCC : 0.038
roc_auc_score: 0.552
-----------------------------------------------------------------------------------------------------
# Display the first five rows of the EMResults1 comparison table
EMResults1.head()
| Model Name | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SVM - Linear | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
| 1 | SVM - Polynominal | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
| 2 | SVM - Gaussian | 257 | 5 | 1415 | 5961 | 0.814 | 0.154 | 0.981 | 0.266 | 0.808 | 0.347 | 0.894539 | 0.895 |
| 3 | SVM - Sigmoid | 158 | 104 | 3675 | 3701 | 0.505 | 0.041 | 0.603 | 0.077 | 0.502 | 0.038 | 0.552408 | 0.552 |
# Load the evaluation-metrics workbook used as the (empty) template for the
# model-comparison loop below. NOTE(review): the original comment said
# "concrete dataset" — this actually reads EMResultsNew.xlsx, not the data file.
EMResults=pd.read_excel(r"EMResultsNew.xlsx",header=0)
# Display up to the first 10 records (only the header row exists at this point)
EMResults.head(10)
| Model Name | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy |
|---|
# Build the classification models to be compared on the same train/test split.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# One estimator per algorithm, default hyper-parameters unless noted
ModelLR = LogisticRegression()
ModelDC = DecisionTreeClassifier()
ModelRF = RandomForestClassifier()
ModelET = ExtraTreesClassifier()
ModelKNN = KNeighborsClassifier(n_neighbors=1)
ModelGNB = GaussianNB()
ModelSVMGaussian = SVC(kernel='rbf', random_state = 42, class_weight='balanced', probability=True)

# Every estimator in this list is evaluated by the comparison loop below
MM = [ModelLR, ModelDC, ModelRF, ModelET, ModelKNN, ModelGNB, ModelSVMGaussian]
# Fit, evaluate and chart every candidate model on the same test split,
# accumulating one summary row per model in EMResults.
# (Indentation restored: the loop body below belongs to the for statement.)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from math import sqrt

for models in MM:
    # Train the model on the training dataset
    models.fit(x_train, y_train)
    # Hard predictions and class probabilities for the test dataset
    y_pred = models.predict(x_test)
    y_pred_prob = models.predict_proba(x_test)
    print('Model Name: ', models)
    # Ground truth and model output under the names used below
    actual = y_test
    predicted = y_pred
    # Confusion matrix with the positive class (1) listed first
    matrix = confusion_matrix(actual, predicted, labels=[1, 0], sample_weight=None, normalize=None)
    print('Confusion matrix : \n', matrix)
    # Flattened counts follow the labels=[1, 0] ordering: tp, fn, fp, tn
    tp, fn, fp, tn = confusion_matrix(actual, predicted, labels=[1, 0]).reshape(-1)
    print('Outcome values : \n', tp, fn, fp, tn)
    # Per-class precision/recall/F1 plus overall accuracy
    C_Report = classification_report(actual, predicted, labels=[1, 0])
    print('Classification report : \n', C_Report)
    # Scalar metrics derived from the confusion-matrix counts
    sensitivity = round(tp / (tp + fn), 3)
    specificity = round(tn / (tn + fp), 3)
    accuracy = round((tp + tn) / (tp + fp + tn + fn), 3)
    balanced_accuracy = round((sensitivity + specificity) / 2, 3)
    precision = round(tp / (tp + fp), 3)
    f1Score = round((2 * tp / (2 * tp + fp + fn)), 3)
    # Matthews Correlation Coefficient (MCC): -1 (poor) .. +1 (perfect)
    mx = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    MCC = round(((tp * tn) - (fp * fn)) / sqrt(mx), 3)
    print('Accuracy :', round(accuracy*100, 2),'%')
    print('Precision :', round(precision*100, 2),'%')
    print('Recall :', round(sensitivity*100,2), '%')
    print('F1 Score :', f1Score)
    print('Specificity or True Negative Rate :', round(specificity*100,2), '%' )
    print('Balanced Accuracy :', round(balanced_accuracy*100, 2),'%')
    print('MCC :', MCC)
    # Area under the ROC curve from the hard predictions
    print('roc_auc_score:', round(roc_auc_score(actual, predicted), 3))
    # ROC curve from the positive-class probabilities
    Model_roc_auc = roc_auc_score(actual, predicted)
    fpr, tpr, thresholds = roc_curve(actual, models.predict_proba(x_test)[:, 1])
    plt.figure()
    # Fix: the original label string ('Classification Model' % Model_roc_auc)
    # had no % placeholder, which raises TypeError; show the AUC in the legend.
    plt.plot(fpr, tpr, label='Classification Model (area = %0.2f)' % Model_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
    print('-----------------------------------------------------------------------------------------------------')
    #----------------------------------------------------------------------------------------------------------
    # Summary row for this model. Fix: store str(models) rather than the live
    # estimator object, so the table holds plain text (the original produced
    # unreadable estimator reprs in the 'Model Name' column).
    new_row = {'Model Name' : str(models),
               'True_Positive' : tp,
               'False_Negative' : fn,
               'False_Positive' : fp,
               'True_Negative' : tn,
               'Accuracy' : accuracy,
               'Precision' : precision,
               'Recall' : sensitivity,
               'F1 Score' : f1Score,
               'Specificity' : specificity,
               'MCC' : MCC,
               'ROC_AUC_Score' : roc_auc_score(actual, predicted),
               'Balanced Accuracy' : balanced_accuracy}
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
    EMResults = pd.concat([EMResults, pd.DataFrame([new_row])], ignore_index=True)
#------------------------------------------------------------------------------
Model Name: LogisticRegression()
Confusion matrix :
[[ 0 262]
[ 0 7376]]
Outcome values :
0 262 0 7376
Classification report :
precision recall f1-score support
1 0.00 0.00 0.00 262
0 0.97 1.00 0.98 7376
accuracy 0.97 7638
macro avg 0.48 0.50 0.49 7638
weighted avg 0.93 0.97 0.95 7638
Accuracy : 96.6 %
Precision : nan %
Recall : 0.0 %
F1 Score : 0.0
Specificity or True Negative Rate : 100.0 %
Balanced Accuracy : 50.0 %
MCC : nan
roc_auc_score: 0.5
-----------------------------------------------------------------------------------------------------
Model Name: DecisionTreeClassifier()
Confusion matrix :
[[ 165 97]
[ 191 7185]]
Outcome values :
165 97 191 7185
Classification report :
precision recall f1-score support
1 0.46 0.63 0.53 262
0 0.99 0.97 0.98 7376
accuracy 0.96 7638
macro avg 0.73 0.80 0.76 7638
weighted avg 0.97 0.96 0.97 7638
Accuracy : 96.2 %
Precision : 46.3 %
Recall : 63.0 %
F1 Score : 0.534
Specificity or True Negative Rate : 97.4 %
Balanced Accuracy : 80.2 %
MCC : 0.521
roc_auc_score: 0.802
-----------------------------------------------------------------------------------------------------
Model Name: RandomForestClassifier()
Confusion matrix :
[[ 146 116]
[ 43 7333]]
Outcome values :
146 116 43 7333
Classification report :
precision recall f1-score support
1 0.77 0.56 0.65 262
0 0.98 0.99 0.99 7376
accuracy 0.98 7638
macro avg 0.88 0.78 0.82 7638
weighted avg 0.98 0.98 0.98 7638
Accuracy : 97.9 %
Precision : 77.2 %
Recall : 55.7 %
F1 Score : 0.647
Specificity or True Negative Rate : 99.4 %
Balanced Accuracy : 77.6 %
MCC : 0.646
roc_auc_score: 0.776
-----------------------------------------------------------------------------------------------------
Model Name: ExtraTreesClassifier()
Confusion matrix :
[[ 128 134]
[ 52 7324]]
Outcome values :
128 134 52 7324
Classification report :
precision recall f1-score support
1 0.71 0.49 0.58 262
0 0.98 0.99 0.99 7376
accuracy 0.98 7638
macro avg 0.85 0.74 0.78 7638
weighted avg 0.97 0.98 0.97 7638
Accuracy : 97.6 %
Precision : 71.1 %
Recall : 48.9 %
F1 Score : 0.579
Specificity or True Negative Rate : 99.3 %
Balanced Accuracy : 74.1 %
MCC : 0.578
roc_auc_score: 0.741
-----------------------------------------------------------------------------------------------------
Model Name: KNeighborsClassifier(n_neighbors=1)
Confusion matrix :
[[ 140 122]
[ 102 7274]]
Outcome values :
140 122 102 7274
Classification report :
precision recall f1-score support
1 0.58 0.53 0.56 262
0 0.98 0.99 0.98 7376
accuracy 0.97 7638
macro avg 0.78 0.76 0.77 7638
weighted avg 0.97 0.97 0.97 7638
Accuracy : 97.1 %
Precision : 57.9 %
Recall : 53.4 %
F1 Score : 0.556
Specificity or True Negative Rate : 98.6 %
Balanced Accuracy : 76.0 %
MCC : 0.541
roc_auc_score: 0.76
-----------------------------------------------------------------------------------------------------
Model Name: GaussianNB()
Confusion matrix :
[[ 262 0]
[2131 5245]]
Outcome values :
262 0 2131 5245
Classification report :
precision recall f1-score support
1 0.11 1.00 0.20 262
0 1.00 0.71 0.83 7376
accuracy 0.72 7638
macro avg 0.55 0.86 0.51 7638
weighted avg 0.97 0.72 0.81 7638
Accuracy : 72.1 %
Precision : 10.9 %
Recall : 100.0 %
F1 Score : 0.197
Specificity or True Negative Rate : 71.1 %
Balanced Accuracy : 85.5 %
MCC : 0.279
roc_auc_score: 0.856
-----------------------------------------------------------------------------------------------------
Model Name: SVC(class_weight='balanced', probability=True, random_state=42)
Confusion matrix :
[[ 257 5]
[1415 5961]]
Outcome values :
257 5 1415 5961
Classification report :
precision recall f1-score support
1 0.15 0.98 0.27 262
0 1.00 0.81 0.89 7376
accuracy 0.81 7638
macro avg 0.58 0.89 0.58 7638
weighted avg 0.97 0.81 0.87 7638
Accuracy : 81.4 %
Precision : 15.4 %
Recall : 98.1 %
F1 Score : 0.266
Specificity or True Negative Rate : 80.8 %
Balanced Accuracy : 89.5 %
MCC : 0.347
roc_auc_score: 0.895
-----------------------------------------------------------------------------------------------------
# display the EMResults
EMResults.head(10)
| Model Name | True_Positive | False_Negative | False_Positive | True_Negative | Accuracy | Precision | Recall | F1 Score | Specificity | MCC | ROC_AUC_Score | Balanced Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression() | 0 | 262 | 0 | 7376 | 0.966 | NaN | 0.0 | 0.0 | 1.0 | NaN | 0.5 | 0.5 |
| 1 | DecisionTreeClassifier() | 165 | 97 | 191 | 7185 | 0.962 | 0.463 | 0.63 | 0.534 | 0.974 | 0.521 | 0.801938 | 0.802 |
| 2 | (DecisionTreeClassifier(max_features='auto', r... | 146 | 116 | 43 | 7333 | 0.979 | 0.772 | 0.557 | 0.647 | 0.994 | 0.646 | 0.775711 | 0.776 |
| 3 | (ExtraTreeClassifier(random_state=1137466445),... | 128 | 134 | 52 | 7324 | 0.976 | 0.711 | 0.489 | 0.579 | 0.993 | 0.578 | 0.74075 | 0.741 |
| 4 | KNeighborsClassifier(n_neighbors=1) | 140 | 122 | 102 | 7274 | 0.971 | 0.579 | 0.534 | 0.556 | 0.986 | 0.541 | 0.760261 | 0.76 |
| 5 | GaussianNB() | 262 | 0 | 2131 | 5245 | 0.721 | 0.109 | 1.0 | 0.197 | 0.711 | 0.279 | 0.855545 | 0.855 |
| 6 | SVC(class_weight='balanced', probability=True,... | 257 | 5 | 1415 | 5961 | 0.814 | 0.154 | 0.981 | 0.266 | 0.808 | 0.347 | 0.894539 | 0.895 |
# Predict the test set with the Random Forest model (the best performer in the
# comparison table above).
y_predRF = ModelRF.predict(x_test)
# Build a frame of actual vs predicted values.
# Fix: the original predicted into an unused variable (misnamed y_predKNN) and
# then stored the stale y_pred left over from the last loop iteration; use the
# Random Forest predictions consistently.
Results = pd.DataFrame({'is_preparatory_A': y_test, 'is_preparatory_P': y_predRF})
# Merge the predictions back onto the original records via the shared index
ResultsFinal = data_bk2.merge(Results, left_index=True, right_index=True)
# Display 5 records chosen at random
ResultsFinal.sample(5)
| id | year | institute_type | round_no | quota | pool | institute_short | program_name | program_duration | degree_short | category | opening_rank | closing_rank | is_preparatory | is_preparatory_A | is_preparatory_P | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12463 | 12464 | 2021 | IIT | 2 | AI | Gender-Neutral | IIT-Kharagpur | Economics | 4 Years | BSc | GEN-EWS | 821 | 981 | 0 | 0 | 0 |
| 11193 | 11194 | 2021 | IIT | 1 | AI | Gender-Neutral | IIT-Jodhpur | Bio Engineering | 4 Years | B.Tech | SC | 2418 | 2724 | 0 | 0 | 1 |
| 14497 | 18679 | 2019 | NIT | 7 | HS | Gender-Neutral | NIT-Agartala | Mathematics and Computing | 5 Years | Int M.Tech | SC | 27407 | 42264 | 0 | 0 | 0 |
| 18430 | 23141 | 2020 | NIT | 6 | OS | Female-Only | NIT-Agartala | Chemical Engineering | 4 Years | B.Tech | GEN-EWS | 9670 | 9670 | 0 | 0 | 0 |
| 16739 | 20920 | 2019 | NIT | 7 | HS | Gender-Neutral | NIT-Rourkela | Chemical Engineering | 5 Years | Btech + M.Tech (IDD) | GEN | 33535 | 36735 | 0 | 0 | 0 |